Set/check knitR option and working directory

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.4
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggpubr)
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(harrietr)
## Registered S3 method overwritten by 'treeio':
##   method     from
##   root.phylo ape
library(here)
## here() starts at /Users/giulieris/OneDrive - The University of Melbourne/R/VANANZ_phenotypes_github
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = here())
print(paste("My working directory is:" ,here()))
## [1] "My working directory is: /Users/giulieris/OneDrive - The University of Melbourne/R/VANANZ_phenotypes_github"
rm(list = ls())

Import raw data

Snippy output

snippy_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/all_pair_id_snps.mask.tab") %>%
  arrange(PAIR_ID)
## Parsed with column specification:
## cols(
##   PAIR_ID = col_character(),
##   REFERENCE = col_character(),
##   ISOLATE = col_character(),
##   CHROM = col_character(),
##   POS = col_double(),
##   TYPE = col_character(),
##   REF = col_character(),
##   ALT = col_character(),
##   EVIDENCE = col_character(),
##   FTYPE = col_character(),
##   STRAND = col_character(),
##   NT_POS = col_character(),
##   AA_POS = col_character(),
##   EFFECT = col_character(),
##   LOCUS_TAG = col_character(),
##   GENE = col_character(),
##   PRODUCT = col_character()
## )
snippy_data 
## # A tibble: 142,030 x 17
##    PAIR_ID REFERENCE ISOLATE CHROM   POS TYPE  REF   ALT   EVIDENCE FTYPE STRAND
##    <chr>   <chr>     <chr>   <chr> <dbl> <chr> <chr> <chr> <chr>    <chr> <chr> 
##  1 GP-0002 BPH2705.… BPH2704 cont… 32669 snp   T     A     A:23 T:0 <NA>  <NA>  
##  2 GP-0002 BPH2705.… BPH2704 cont…    14 snp   G     T     T:29 G:0 <NA>  <NA>  
##  3 GP-0002 BPH2705.… BPH2704 cont…   521 snp   C     A     A:23 C:0 <NA>  <NA>  
##  4 GP-0002 BPH2705.… BPH2704 cont…   389 snp   T     A     A:18 T:0 <NA>  <NA>  
##  5 GP-0002 BPH2705.… BPH2704 cont… 61262 snp   A     C     C:25 A:0 CDS   +     
##  6 GP-0002 BPH2705.… BPH2704 cont… 59956 snp   C     A     A:21 C:0 CDS   +     
##  7 GP-0002 BPH2705.… BPH2704 cont… 38034 snp   C     A     A:41 C:0 CDS   +     
##  8 GP-0002 BPH2705.… BPH2704 cont… 37962 snp   A     C     C:38 A:0 CDS   +     
##  9 GP-0002 BPH2705.… BPH2704 cont…  2728 snp   A     T     T:18 A:0 <NA>  <NA>  
## 10 GP-0002 BPH2705.… BPH2704 cont…  2695 snp   C     A     A:20 C:0 <NA>  <NA>  
## # … with 142,020 more rows, and 6 more variables: NT_POS <chr>, AA_POS <chr>,
## #   EFFECT <chr>, LOCUS_TAG <chr>, GENE <chr>, PRODUCT <chr>
# modify snippy output: 
# 1) generate iso1 and iso2 for merging with the phenotypic analysis
# 2) modify CHROM (contig name) to be consistent with the labelling in bed files down the track
# 3) extract mutation effects for easier interpretation


source("../Functions/aa_convert.R")
snippy_data_modified <- snippy_data %>%
  mutate(iso1 = str_remove(REFERENCE, ".gbk"),
         iso2 = ISOLATE,
         CHROM = str_c(iso1, "_", CHROM)) %>%
  separate(EFFECT, 
           into = c("EFFTYPE", "NUCLEOTIDE_CHANGE", "MUTATION"), 
           sep = "\\s", 
           remove = T, 
           extra = "merge") %>%
  mutate(NUCLEOTIDE_CHANGE = str_remove(NUCLEOTIDE_CHANGE, "c."),
         MUTATION = str_remove(MUTATION, "p."),
         MUTATION_SHORT = aa_convert(MUTATION)) %>%
  separate(AA_POS, 
           into = c("AA_POS", "AA_LENGTH"), 
           sep = "/", 
           remove = T) %>%
  mutate_at(vars(starts_with("AA")), 
            as.numeric)
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 488 rows [458,
## 459, 554, 555, 940, 941, 942, 943, 944, 994, 995, 996, 997, 998, 1048, 1049,
## 1050, 1051, 1052, 5899, ...].
## Parsed with column specification:
## cols(
##   `Amino acid` = col_character(),
##   `Three letter code` = col_character(),
##   `One letter code` = col_character()
## )
snippy_data_modified
## # A tibble: 142,030 x 23
##    PAIR_ID REFERENCE ISOLATE CHROM   POS TYPE  REF   ALT   EVIDENCE FTYPE STRAND
##    <chr>   <chr>     <chr>   <chr> <dbl> <chr> <chr> <chr> <chr>    <chr> <chr> 
##  1 GP-0002 BPH2705.… BPH2704 BPH2… 32669 snp   T     A     A:23 T:0 <NA>  <NA>  
##  2 GP-0002 BPH2705.… BPH2704 BPH2…    14 snp   G     T     T:29 G:0 <NA>  <NA>  
##  3 GP-0002 BPH2705.… BPH2704 BPH2…   521 snp   C     A     A:23 C:0 <NA>  <NA>  
##  4 GP-0002 BPH2705.… BPH2704 BPH2…   389 snp   T     A     A:18 T:0 <NA>  <NA>  
##  5 GP-0002 BPH2705.… BPH2704 BPH2… 61262 snp   A     C     C:25 A:0 CDS   +     
##  6 GP-0002 BPH2705.… BPH2704 BPH2… 59956 snp   C     A     A:21 C:0 CDS   +     
##  7 GP-0002 BPH2705.… BPH2704 BPH2… 38034 snp   C     A     A:41 C:0 CDS   +     
##  8 GP-0002 BPH2705.… BPH2704 BPH2… 37962 snp   A     C     C:38 A:0 CDS   +     
##  9 GP-0002 BPH2705.… BPH2704 BPH2…  2728 snp   A     T     T:18 A:0 <NA>  <NA>  
## 10 GP-0002 BPH2705.… BPH2704 BPH2…  2695 snp   C     A     A:20 C:0 <NA>  <NA>  
## # … with 142,020 more rows, and 12 more variables: NT_POS <chr>, AA_POS <dbl>,
## #   AA_LENGTH <dbl>, EFFTYPE <chr>, NUCLEOTIDE_CHANGE <chr>, MUTATION <chr>,
## #   LOCUS_TAG <chr>, GENE <chr>, PRODUCT <chr>, iso1 <chr>, iso2 <chr>,
## #   MUTATION_SHORT <chr>
snippy_data_modified %>%
  .$ISOLATE %>%
  n_distinct()
## [1] 253
df_n_mutations <- snippy_data_modified %>%
  count(PAIR_ID, sort = T)
df_n_mutations %>%
  ggplot(aes(x = n)) +
  # geom_histogram() +
  geom_density() +
  theme_bw()

distant_pairs <- df_n_mutations %>%
  filter(n > 100) %>%
  .$PAIR_ID
snippy_data_modified_no_distant_pairs <- snippy_data_modified %>%
  filter(!PAIR_ID %in% distant_pairs)
rm(distant_pairs, df_n_mutations)

clustering of protein genes (cd-hit)

protein_clusters_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/mutated_proteins.cd-hit.tab") %>%
  rename_all(funs(str_c(., "_prot"))) 
## Parsed with column specification:
## cols(
##   id = col_character(),
##   clstr = col_double(),
##   clstr_size = col_double(),
##   length = col_double(),
##   clstr_rep = col_double(),
##   clstr_iden = col_character(),
##   clstr_cov = col_character()
## )
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
nebraska_clusters <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/representative_proteins_FPR3757_cd-hit.tab") %>%
  mutate(source = if_else(str_detect(id, "SAUSA300"), "FPR3757", "mutated_proteins")) %>%
  group_by(clstr) %>%
  filter(any(str_detect(id, "BPH"))) %>%
  group_by(clstr, source) %>%
  arrange(desc(clstr_cov, clstr_id)) %>%
  filter(!(source == "FPR3757" & row_number() > 1)) %>%
  mutate(source_long = str_c(source, "_", row_number())) %>%
  ungroup() %>%
  select(id, clstr, source_long) %>%
  pivot_wider(names_from = source_long, values_from = id) %>%
  pivot_longer(cols = starts_with("mutated_proteins"), names_to = "source_long", values_to = "id_prot") %>%
  drop_na(id_prot) %>%
  select(id_prot, nebraska_locus_tag = FPR3757_1) 
## Parsed with column specification:
## cols(
##   id = col_character(),
##   clstr = col_double(),
##   clstr_size = col_double(),
##   length = col_double(),
##   clstr_rep = col_double(),
##   clstr_iden = col_character(),
##   clstr_cov = col_character()
## )
df_neb <- read_csv("Ideas_Grant_2020_analysis/Raw_data/nebraska_all_proteins.csv") %>%
  rename(neb_mutant_id = `Strain Name`, neb_gene = `Gene name`, neb_product = `gene discription`)
## Parsed with column specification:
## cols(
##   `Strain Name` = col_character(),
##   plate384 = col_double(),
##   Well384 = col_character(),
##   `Gene name` = col_character(),
##   `gene discription` = col_character(),
##   nebraska_locus_tag = col_character(),
##   Row384 = col_character(),
##   Column384 = col_double(),
##   nebraska_aa_seq = col_character()
## )
protein_clusters_data <- protein_clusters_data %>%
  left_join(nebraska_clusters) %>%
  group_by(clstr_prot) %>%
  mutate(nebraska_locus_tag = nebraska_locus_tag[which(clstr_rep_prot == 1)]) %>%
  left_join(df_neb)
## Joining, by = "id_prot"
## Joining, by = "nebraska_locus_tag"
protein_seq_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/mutated_proteins.tab") %>%
  rename_all(funs(str_c(., "_prot")))
## Parsed with column specification:
## cols(
##   FASTA_ID = col_character(),
##   SEQUENCE = col_character()
## )

merge clusters and sequences with snippy data

snippy_data_modified_proteins <- snippy_data_modified_no_distant_pairs %>%
  left_join(protein_clusters_data,
            by = c("LOCUS_TAG" = "id_prot")) %>%
  left_join(protein_seq_data,
            by = c("LOCUS_TAG" = "FASTA_ID_prot"))
snippy_data_modified_proteins
## # A tibble: 104,035 x 39
##    PAIR_ID REFERENCE ISOLATE CHROM   POS TYPE  REF   ALT   EVIDENCE FTYPE STRAND
##    <chr>   <chr>     <chr>   <chr> <dbl> <chr> <chr> <chr> <chr>    <chr> <chr> 
##  1 GP-0002 BPH2705.… BPH2704 BPH2… 32669 snp   T     A     A:23 T:0 <NA>  <NA>  
##  2 GP-0002 BPH2705.… BPH2704 BPH2…    14 snp   G     T     T:29 G:0 <NA>  <NA>  
##  3 GP-0002 BPH2705.… BPH2704 BPH2…   521 snp   C     A     A:23 C:0 <NA>  <NA>  
##  4 GP-0002 BPH2705.… BPH2704 BPH2…   389 snp   T     A     A:18 T:0 <NA>  <NA>  
##  5 GP-0002 BPH2705.… BPH2704 BPH2… 61262 snp   A     C     C:25 A:0 CDS   +     
##  6 GP-0002 BPH2705.… BPH2704 BPH2… 59956 snp   C     A     A:21 C:0 CDS   +     
##  7 GP-0002 BPH2705.… BPH2704 BPH2… 38034 snp   C     A     A:41 C:0 CDS   +     
##  8 GP-0002 BPH2705.… BPH2704 BPH2… 37962 snp   A     C     C:38 A:0 CDS   +     
##  9 GP-0002 BPH2705.… BPH2704 BPH2…  2728 snp   A     T     T:18 A:0 <NA>  <NA>  
## 10 GP-0002 BPH2705.… BPH2704 BPH2…  2695 snp   C     A     A:20 C:0 <NA>  <NA>  
## # … with 104,025 more rows, and 28 more variables: NT_POS <chr>, AA_POS <dbl>,
## #   AA_LENGTH <dbl>, EFFTYPE <chr>, NUCLEOTIDE_CHANGE <chr>, MUTATION <chr>,
## #   LOCUS_TAG <chr>, GENE <chr>, PRODUCT <chr>, iso1 <chr>, iso2 <chr>,
## #   MUTATION_SHORT <chr>, clstr_prot <dbl>, clstr_size_prot <dbl>,
## #   length_prot <dbl>, clstr_rep_prot <dbl>, clstr_iden_prot <chr>,
## #   clstr_cov_prot <chr>, nebraska_locus_tag <chr>, neb_mutant_id <chr>,
## #   plate384 <dbl>, Well384 <chr>, neb_gene <chr>, neb_product <chr>,
## #   Row384 <chr>, Column384 <dbl>, nebraska_aa_seq <chr>, SEQUENCE_prot <chr>

clustering and annotation of intergenic regions

gff with annotation of intergenic regions

col_names <- c("CHROM_intergenic", 
               "START", 
               "END", 
               "CHROM_protein", 
               "SOURCE", 
               "TYPE", 
               "START_flank_prot", 
               "END_flank_prot", 
               "SCORE", 
               "STRAND_flank_prot", 
               "PHASE", 
               "ATTRIBUTES_flank_prot",  
               "CHROM_mutation",
               "POS_minus1",
               "POS", 
               "PAIR_ID", 
               "REFERENCE", 
               "ISOLATE")

intergenic_regions_annotated <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/all_mutated.intergenic.annotated.bed",
                                         col_names = col_names) %>%
  select(CHROM = CHROM_intergenic, 
         START, 
         END, 
         START_flank_prot, 
         END_flank_prot, 
         STRAND_flank_prot, 
         ATTRIBUTES_flank_prot, 
         POS, 
         PAIR_ID, 
         REFERENCE, 
         ISOLATE) 
## Parsed with column specification:
## cols(
##   CHROM_intergenic = col_character(),
##   START = col_double(),
##   END = col_double(),
##   CHROM_protein = col_character(),
##   SOURCE = col_character(),
##   TYPE = col_character(),
##   START_flank_prot = col_double(),
##   END_flank_prot = col_double(),
##   SCORE = col_character(),
##   STRAND_flank_prot = col_character(),
##   PHASE = col_double(),
##   ATTRIBUTES_flank_prot = col_character(),
##   CHROM_mutation = col_character(),
##   POS_minus1 = col_double(),
##   POS = col_double(),
##   PAIR_ID = col_character(),
##   REFERENCE = col_character(),
##   ISOLATE = col_character()
## )
upstream_proteins <- intergenic_regions_annotated %>%
  group_by(CHROM, START, END, POS, PAIR_ID, REFERENCE, ISOLATE) %>%
  filter(END_flank_prot < POS) %>%
  rename_at(vars(ends_with("_flank_prot")),
            funs(str_c(., "_upstream")))
downstream_proteins <- intergenic_regions_annotated %>%
  group_by(CHROM, START, END, POS, PAIR_ID, REFERENCE, ISOLATE) %>%
  filter(START_flank_prot > POS) %>%
  rename_at(vars(ends_with("_flank_prot")),
            funs(str_c(., "_downstream")))
intergenic_regions_annotated <- upstream_proteins %>%
  left_join(downstream_proteins)
## Joining, by = c("CHROM", "START", "END", "POS", "PAIR_ID", "REFERENCE", "ISOLATE")
intergenic_regions_annotated
## # A tibble: 42,815 x 15
## # Groups:   CHROM, START, END, POS, PAIR_ID, REFERENCE, ISOLATE [42,815]
##    CHROM  START    END START_flank_pro… END_flank_prot_… STRAND_flank_pr…
##    <chr>  <dbl>  <dbl>            <dbl>            <dbl> <chr>           
##  1 BPH2… 213895 214187           212549           213895 +               
##  2 BPH2… 162407 162758           162189           162407 -               
##  3 BPH2… 162407 162758           162189           162407 -               
##  4 BPH2… 162407 162758           162189           162407 -               
##  5 BPH2… 174634 175871           174428           174634 +               
##  6 BPH2… 174634 175871           174428           174634 +               
##  7 BPH2… 178465 178709           178217           178465 +               
##  8 BPH2… 181179 181441           180931           181179 +               
##  9 BPH2… 231689 232169           230541           231689 +               
## 10 BPH2…  98412  98874            97246            98412 -               
## # … with 42,805 more rows, and 9 more variables:
## #   ATTRIBUTES_flank_prot_upstream <chr>, POS <dbl>, PAIR_ID <chr>,
## #   REFERENCE <chr>, ISOLATE <chr>, START_flank_prot_downstream <dbl>,
## #   END_flank_prot_downstream <dbl>, STRAND_flank_prot_downstream <chr>,
## #   ATTRIBUTES_flank_prot_downstream <chr>
rm(upstream_proteins, downstream_proteins)

cd-hit clustering of intergenic regions

intergenic_clusters_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/all_mutated.intergenic.cd-hit.tab") %>%
  separate(id, into = c("CHROM", "START", "END"), sep = "[:-]", remove = F) %>%
  rename_at(vars(id, starts_with("clstr")),
            funs(str_c(., "_intergenic"))) %>%
  mutate_at(vars(START, END), as.numeric) %>%
  distinct()
## Parsed with column specification:
## cols(
##   id = col_character(),
##   clstr = col_double(),
##   clstr_size = col_double(),
##   length = col_double(),
##   clstr_rep = col_double(),
##   clstr_iden = col_character(),
##   clstr_cov = col_character()
## )
snippy_data_modified_proteins_intergenic_regions <- snippy_data_modified_proteins %>%
  left_join(intergenic_regions_annotated,
            by = c("CHROM", "POS", "PAIR_ID", "REFERENCE", "ISOLATE")) %>%
  left_join(intergenic_clusters_data)
## Joining, by = c("CHROM", "START", "END")

Mortality switches and phenotypes

df_phenotypes <- read_csv("Ideas_Grant_2020_analysis/Genetic_pairs_table/genetic_pairs_pheno_changes_mortality_switches.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   iso2 = col_character(),
##   iso1 = col_character(),
##   iso1_ST = col_character(),
##   iso1_strain_group = col_character(),
##   iso1_mortality = col_character(),
##   iso1_included = col_logical(),
##   iso1_scv = col_logical(),
##   iso1_persister_type = col_character(),
##   iso1_sample_type = col_character(),
##   iso1_genes = col_character(),
##   iso1_CC = col_character(),
##   iso2_ST = col_character(),
##   iso2_strain_group = col_character(),
##   iso2_mortality = col_character(),
##   iso2_included = col_logical(),
##   iso2_scv = col_logical(),
##   iso2_persister_type = col_character(),
##   iso2_sample_type = col_character(),
##   iso2_genes = col_character(),
##   iso2_CC = col_character()
##   # ... with 2 more columns
## )
## See spec(...) for full column specifications.
# check available phenotypes
# mortality
df_phenotypes %>%
  select(iso1, iso2, switches) %>%
  distinct() %>%
  count(switches)
## # A tibble: 5 x 2
##   switches              n
##   <chr>             <int>
## 1 Died-Died            42
## 2 Died-Survived        20
## 3 Survived-Died        20
## 4 Survived-Survived   146
## 5 <NA>                  6
# most mortality phenotypes are not available. Need to recalculate that
df_all_genetic_pairs_pheno <- read_csv("Ideas_Grant_2020_analysis/Genetic_pairs_table/df_all_genetic_pairs_pheno.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   iso2 = col_character(),
##   iso1 = col_character(),
##   iso1_ST = col_character(),
##   iso1_strain_group = col_character(),
##   iso1_mortality = col_character(),
##   iso1_included = col_logical(),
##   iso1_scv = col_logical(),
##   iso1_persister_type = col_character(),
##   iso1_sample_type = col_character(),
##   iso1_genes = col_character(),
##   iso1_CC = col_character(),
##   iso2_ST = col_character(),
##   iso2_strain_group = col_character(),
##   iso2_mortality = col_character(),
##   iso2_included = col_logical(),
##   iso2_scv = col_logical(),
##   iso2_persister_type = col_character(),
##   iso2_sample_type = col_character(),
##   iso2_genes = col_character(),
##   iso2_CC = col_character()
##   # ... with 1 more columns
## )
## See spec(...) for full column specifications.
df_all_genetic_pairs_pheno %>%
  select(iso1, iso2, iso1_mortality, iso2_mortality) %>%
  filter(is.na(iso1_mortality) | is.na(iso2_mortality))
## # A tibble: 90 x 4
##    iso1    iso2    iso1_mortality iso2_mortality
##    <chr>   <chr>   <chr>          <chr>         
##  1 BPH2751 BPH2750 Died           <NA>          
##  2 BPH2750 BPH2751 <NA>           Died          
##  3 BPH2888 BPH2887 Died           <NA>          
##  4 BPH2887 BPH2888 <NA>           Died          
##  5 BPH2898 BPH2897 Died           <NA>          
##  6 BPH2897 BPH2898 <NA>           Died          
##  7 BPH2966 BPH2965 Died           <NA>          
##  8 BPH2965 BPH2966 <NA>           Died          
##  9 BPH2988 BPH2987 Died           <NA>          
## 10 BPH2987 BPH2988 <NA>           Died          
## # … with 80 more rows
df_all_genetic_pairs_pheno_switches <- df_all_genetic_pairs_pheno %>%
  mutate(switches = str_c(iso1_mortality, "-", iso2_mortality))
 
# count
df_all_genetic_pairs_pheno_switches %>%
  select(iso1, iso2, switches) %>%
  distinct() %>%
  count(switches)
## # A tibble: 5 x 2
##   switches              n
##   <chr>             <int>
## 1 Died-Died            68
## 2 Died-Survived       316
## 3 Survived-Died       316
## 4 Survived-Survived  1694
## 5 <NA>                 90
# df_mutations_phenotypes <- snippy_data_modified_proteins_intergenic_regions %>%
#  left_join(df_phenotypes)

Analyse mortality switches

No intergenic for now until we have fixed the merging issues above

df_mutations_phenotypes_mortality <- snippy_data_modified_proteins %>%
  left_join(df_all_genetic_pairs_pheno_switches)
## Joining, by = c("iso1", "iso2")
# convergence_all <- df_mutations_phenotypes_mortality %>%
#   group_by(clstr_prot) %>%
#   summarise(GENE = str_c(unique(GENE), collapse = "|"), 
#             n = n_distinct(PAIR_ID),
#             switches = str_c(switches, collapse = "|"),
#             mutations = str_c(unique(MUTATION_SHORT), collapse = "|"))

# mortality_convergence <- df_mutations_phenotypes_mortality %>%
#   group_by(switches, clstr_prot) %>%
#   summarise(n = n_distinct(PAIR_ID),
#             GENE = str_c(unique(GENE), collapse = "|"),
#             pairs = str_c(str_c(iso1, "-", iso2), collapse = "|"),
#             mutations = str_c(unique(MUTATION_SHORT), collapse = "|"))

Try with smaller dataset

# df_phenotypes_no_dup <- df_phenotypes %>%
#   rowwise() %>%
#   mutate(key = paste(sort(c(iso1, iso2, switches)), collapse = "")) %>%
#   #select(iso1, iso2, switches, key)
#   distinct(key, .keep_all = T) %>%
#   ungroup()
# 
# df_mutations_phenotypes_no_dup <- snippy_data_modified_proteins %>%
#   right_join(df_phenotypes_no_dup)
# 
# convergence_all <- df_mutations_phenotypes_no_dup %>%
#   drop_na(clstr_prot) %>%
#   group_by(clstr_prot, clstr_size_prot) %>%
#   summarise(gene = str_c(unique(GENE), collapse = "|"),
#             n = n_distinct(PAIR_ID),
#             switches = str_c(switches, collapse = "|"))
# 
# convergence_mortality <- df_mutations_phenotypes_no_dup %>%
#   drop_na(clstr_prot) %>%
#   filter(switches %in% c("Survived-Died", "Died-Survived")) %>%
#   group_by(clstr_prot, clstr_size_prot) %>%
#   summarise(gene = str_c(unique(GENE), collapse = "|"),
#             product = str_c(unique(PRODUCT), collapse = "|"),
#             mutations = str_c(unique(MUTATION_SHORT), collapse = "|"),
#             n = n_distinct(PAIR_ID), n_references = n_distinct(REFERENCE))
# 
# convergence_mortality_control <- df_mutations_phenotypes_no_dup %>%
#   drop_na(clstr_prot) %>%
#   filter(switches %in% c("Survived-Survived", "Died-Died")) %>%
#   group_by(clstr_prot, clstr_size_prot) %>%
#   summarise(gene = str_c(unique(GENE), collapse = "|"),
#             product = str_c(unique(PRODUCT), collapse = "|"),
#             mutations = str_c(unique(MUTATION_SHORT), collapse = "|"),
#             n = n_distinct(PAIR_ID), n_references = n_distinct(REFERENCE))

Try again on the large dataset

Select pairs

We need a non-symmetric dataset

require(harrietr)

snp.dist.mat <- read.csv("Data_analysis/Genetic_pairs_analysis/VANANZ_core.aln.dist.mat", sep = "\t")  %>%
  as.matrix(.)
str(snp.dist.mat)
##  chr [1:844, 1:845] "BPH2700" "BPH2701" "BPH2702" "BPH2703" "BPH2704" ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:845] "snp.dists.0.6.3" "BPH2700" "BPH2701" "BPH2702" ...
# put 1st column as row name and remove 1st column
row.names(snp.dist.mat) <- snp.dist.mat[,1]
snp.dist.mat <- snp.dist.mat[,2:845]
str(snp.dist.mat)
##  chr [1:844, 1:844] "    0" "18224" " 9375" " 9267" "25534" "25539" "29327" ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:844] "BPH2700" "BPH2701" "BPH2702" "BPH2703" ...
##   ..$ : chr [1:844] "BPH2700" "BPH2701" "BPH2702" "BPH2703" ...
snp.dist.df <- harrietr::melt_dist(snp.dist.mat) %>%
  mutate(dist = as.integer(dist)) %>%
  filter(iso1 != "Reference") # remove reference
## Warning: attributes are not identical across measure variables;
## they will be dropped
str(snp.dist.df)
## 'data.frame':    355501 obs. of  3 variables:
##  $ iso1: chr  "BPH2701" "BPH2702" "BPH2703" "BPH2704" ...
##  $ iso2: chr  "BPH2700" "BPH2700" "BPH2700" "BPH2700" ...
##  $ dist: int  18224 9375 9267 25534 25539 29327 9232 9190 9210 11106 ...
close_strain.df <- snp.dist.df %>%
  filter(dist <= 30)
nrow(close_strain.df)
## [1] 1242
# check that there are no duplicate pairs
df_close_pairs <- close_strain.df %>%
  distinct(iso1, iso2)
nrow(df_close_pairs)
## [1] 1242

Filter existing dataset of mutations

# df_mutations_phenotypes_mortality_non_symmetric <- df_mutations_phenotypes_mortality %>%
#   right_join(close_strain.df)
# 
# df_convergence_mortality_non_symmetric <- df_mutations_phenotypes_mortality %>%
#   drop_na(clstr_prot) %>%
#   filter(switches %in% c("Survived-Died", "Died-Survived")) %>%
#   group_by(clstr_prot, clstr_size_prot) %>%
#   summarise(gene = str_c(unique(GENE), collapse = "|"),
#             product = str_c(unique(PRODUCT), collapse = "|"),
#             mutations = str_c(unique(MUTATION_SHORT), collapse = "|"),
#             n = n_distinct(PAIR_ID), n_references = n_distinct(REFERENCE))
# 
# df_convergence_mortality_non_symmetric_long <- df_mutations_phenotypes_mortality %>%
#   drop_na(clstr_prot) %>%
#   filter(switches %in% c("Survived-Died", "Died-Survived")) %>%
#   filter(EFFTYPE != "synonymous_variant") %>%
#   group_by(clstr_prot, clstr_size_prot) %>%
#   mutate(n = n_distinct(PAIR_ID), 
#          n_references = n_distinct(REFERENCE)) %>%
#   select(n_references, n, everything()) %>%
#   filter(n_references > 1)

Try creating independent pairs for mortality switches

df_mortality_close_pairs_indep <- df_all_genetic_pairs_pheno_switches %>%
  right_join(df_close_pairs) %>%
  filter(switches %in% c("Survived-Died", "Died-Survived")) %>%
  arrange(iso1) %>%
  mutate(same_iso1 = if_else(row_number() == 1, FALSE, lag(iso1) == iso1)) %>%
  filter(!same_iso1) %>%
  arrange(iso2) %>%
  mutate(same_iso2 = if_else(row_number() == 1, FALSE, lag(iso2) == iso2)) %>%
  filter(!same_iso2) %>%
  select(iso1, same_iso1, iso2, same_iso2, everything())
## Joining, by = c("iso2", "iso1")
df_mutations_phenotypes_mortality_indep <- snippy_data_modified_proteins %>%
  right_join(df_mortality_close_pairs_indep) 
## Joining, by = c("iso1", "iso2")
df_convergence_mortality_indep <- df_mutations_phenotypes_mortality_indep %>%
  drop_na(PRODUCT) %>%
  filter(EFFTYPE != "synonymous_variant") %>%
  group_by(clstr_prot, clstr_size_prot) %>%
   mutate(n = n_distinct(PAIR_ID), 
         n_references = n_distinct(REFERENCE)) %>%
  select(n_references, n, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, everything()) %>%
  arrange(clstr_prot, desc(n_references))

df_convergence_mortality_indep
## # A tibble: 524 x 116
## # Groups:   clstr_prot, clstr_size_prot [403]
##    n_references     n clstr_prot clstr_size_prot GENE  PRODUCT MUTATION_SHORT
##           <int> <int>      <dbl>           <dbl> <chr> <chr>   <chr>         
##  1            3     3          0              61 ebh   Extrac… A2602E        
##  2            3     3          0              61 ebh   Extrac… F139S         
##  3            3     3          0              61 ebh   Extrac… Q7253fs       
##  4            1     1          5              23 <NA>  hypoth… F1903L        
##  5            3     3          7              73 tycC  Tyroci… L1024fs       
##  6            3     3          7              73 tycC  Tyroci… E1333K        
##  7            3     3          7              73 tycC  Tyroci… K2308fs       
##  8            1     1         11               6 <NA>  hypoth… S1502N        
##  9            1     1         13              54 <NA>  hypoth… G1212S        
## 10            1     1         15              24 essC  Type V… G1038D        
## # … with 514 more rows, and 109 more variables: PAIR_ID <chr>, REFERENCE <chr>,
## #   ISOLATE <chr>, CHROM <chr>, POS <dbl>, TYPE <chr>, REF <chr>, ALT <chr>,
## #   EVIDENCE <chr>, FTYPE <chr>, STRAND <chr>, NT_POS <chr>, AA_POS <dbl>,
## #   AA_LENGTH <dbl>, EFFTYPE <chr>, NUCLEOTIDE_CHANGE <chr>, MUTATION <chr>,
## #   LOCUS_TAG <chr>, iso1 <chr>, iso2 <chr>, length_prot <dbl>,
## #   clstr_rep_prot <dbl>, clstr_iden_prot <chr>, clstr_cov_prot <chr>,
## #   nebraska_locus_tag <chr>, neb_mutant_id <chr>, plate384 <dbl>,
## #   Well384 <chr>, neb_gene <chr>, neb_product <chr>, Row384 <chr>,
## #   Column384 <dbl>, nebraska_aa_seq <chr>, SEQUENCE_prot <chr>,
## #   same_iso1 <lgl>, same_iso2 <lgl>, dist <dbl>, iso1_ST <chr>,
## #   iso1_strain_group <chr>, iso1_mecA <dbl>, iso1_mortality <chr>,
## #   iso1_intrahost_index <dbl>, iso1_included <lgl>,
## #   iso1_unrelated_to_index <dbl>, iso1_mut_count <dbl>, iso1_scv <lgl>,
## #   iso1_vanco_difference <dbl>, iso1_persister_type <chr>,
## #   iso1_intrahost_sampledelay <dbl>, iso1_sample_type <chr>, iso1_genes <chr>,
## #   iso1_CC <chr>, iso2_ST <chr>, iso2_strain_group <chr>, iso2_mecA <dbl>,
## #   iso2_mortality <chr>, iso2_intrahost_index <dbl>, iso2_included <lgl>,
## #   iso2_unrelated_to_index <dbl>, iso2_mut_count <dbl>, iso2_scv <lgl>,
## #   iso2_vanco_difference <dbl>, iso2_persister_type <chr>,
## #   iso2_intrahost_sampledelay <dbl>, iso2_sample_type <chr>, iso2_genes <chr>,
## #   iso2_CC <chr>, CC <chr>, iso1_time_of_max_rate_OD <dbl>,
## #   iso1_max_rate_OD <dbl>, iso1_doubling_time_OD <dbl>, iso1_AUC_OD <dbl>,
## #   iso1_time_of_max_OD <dbl>, iso1_max_OD <dbl>, iso1_time_of_min_OD <dbl>,
## #   iso1_min_OD <dbl>, iso1_end_point_OD <dbl>,
## #   iso1_end_point_timepoint_OD <dbl>, iso1_AUC_death <dbl>,
## #   iso1_time_of_max_death <dbl>, iso1_max_death <dbl>,
## #   iso1_time_of_min_death <dbl>, iso1_min_death <dbl>,
## #   iso1_time_of_max_rate_death <dbl>, iso1_max_rate_death <dbl>,
## #   iso1_doubling_time_death <dbl>, iso1_end_point_death <dbl>,
## #   iso1_end_point_timepoint_death <dbl>, iso2_time_of_max_rate_OD <dbl>,
## #   iso2_max_rate_OD <dbl>, iso2_doubling_time_OD <dbl>, iso2_AUC_OD <dbl>,
## #   iso2_time_of_max_OD <dbl>, iso2_max_OD <dbl>, iso2_time_of_min_OD <dbl>,
## #   iso2_min_OD <dbl>, iso2_end_point_OD <dbl>,
## #   iso2_end_point_timepoint_OD <dbl>, iso2_AUC_death <dbl>,
## #   iso2_time_of_max_death <dbl>, …
n_distinct(df_convergence_mortality_indep$PAIR_ID)
## [1] 22
t_mortality_indep <- df_convergence_mortality_indep %>%
  distinct(n_references, n, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
  group_by(n_references, clstr_prot, clstr_size_prot, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
  summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
               funs(str_c(unique(.), collapse = "|")))

Same with controls

df_mortality_close_pairs_indep_controls <- df_all_genetic_pairs_pheno_switches %>%
  right_join(df_close_pairs) %>%
  drop_na(switches) %>%
  filter(!switches %in% c("Survived-Died", "Died-Survived")) %>%
  arrange(iso1) %>%
  mutate(same_iso1 = if_else(row_number() == 1, FALSE, lag(iso1) == iso1)) %>%
  filter(!same_iso1) %>%
  arrange(iso2) %>%
  mutate(same_iso2 = if_else(row_number() == 1, FALSE, lag(iso2) == iso2)) %>%
  filter(!same_iso2) %>%
  select(iso1, same_iso1, iso2, same_iso2, everything())
## Joining, by = c("iso2", "iso1")
df_mutations_phenotypes_mortality_indep_controls <- snippy_data_modified_proteins %>%
  right_join(df_mortality_close_pairs_indep) 
## Joining, by = c("iso1", "iso2")
n_distinct(df_mutations_phenotypes_mortality_indep_controls$PAIR_ID)
## [1] 23
df_convergence_mortality_controls <- df_mutations_phenotypes_mortality_indep %>%
  drop_na(PRODUCT) %>%
  filter(EFFTYPE != "synonymous_variant") %>%
  group_by(clstr_prot, clstr_size_prot) %>%
   mutate(n = n_distinct(PAIR_ID), 
         n_references = n_distinct(REFERENCE)) %>%
  select(n_references, n, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, everything()) %>%
  arrange(clstr_prot, desc(n_references))

t_controls <- df_convergence_mortality_controls %>%
  distinct(n_references, n, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT) %>%
  group_by(n_references, clstr_prot, clstr_size_prot) %>%
  summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
               funs(str_c(unique(.), collapse = "|")))

Ideas

unique mutations for each lethal strain

tree with mutations

as many switches as lethal strains

df_mortality_close_clusters <- df_all_genetic_pairs_pheno_switches %>%
  arrange(iso2) %>%
  filter(switches %in% c("Survived-Died")) %>%
  group_by(iso2) %>%
  mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
  # rowwise() %>%
  # mutate(key = str_c(sort(c(iso1, iso2)), collapse = "")) %>%
  # select(iso1, iso2, switches, iso_cluster, key) %>%
  # distinct(key, .keep_all = T) %>%
  select(iso1, iso2, switches, iso_cluster)

df_mortality_close_clusters %>%
  .$iso_cluster %>%
  n_distinct()
## [1] 28
df_mortality_close_clusters %>%
  ggplot(aes(x = fct_infreq(iso_cluster))) +
  geom_bar() +
  coord_flip() +
  labs(x = "") +
  theme_bw()

df_mutations_phenotypes_mortality_close_clusters <- snippy_data_modified_proteins %>%
  right_join(df_mortality_close_clusters) 
## Joining, by = c("iso1", "iso2")
df_convergence_mortality_clusters <- df_mutations_phenotypes_mortality_close_clusters %>%
  drop_na(PRODUCT) %>%
  filter(EFFTYPE != "synonymous_variant") %>%
  group_by(clstr_prot, clstr_size_prot) %>%
   mutate(n_clusters = n_distinct(iso_cluster), 
          n_pairs = n_distinct(PAIR_ID),
         n_references = n_distinct(REFERENCE)) %>%
  select(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, iso_cluster, PAIR_ID, switches, everything()) %>%
  arrange(clstr_prot, desc(n_clusters))

t_mortality_clusters <- df_convergence_mortality_clusters %>%
  group_by(clstr_prot) %>%
  mutate(SEQUENCE_prot = SEQUENCE_prot[1]) %>%
  distinct(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, GENE, PRODUCT, MUTATION_SHORT, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
  group_by(n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
  summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
               funs(str_c(unique(.), collapse = "|")))%>%
  mutate(phenotype = "SAB mortality") %>%
  arrange(desc(n_clusters))
t_mortality_clusters
## # A tibble: 689 x 13
## # Groups:   n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot,
## #   nebraska_locus_tag, neb_mutant_id, neb_gene [689]
##    n_clusters n_pairs clstr_prot clstr_size_prot SEQUENCE_prot nebraska_locus_…
##         <int>   <int>      <dbl>           <dbl> <chr>         <chr>           
##  1         13      25          7              73 MIMGNLRFQQEY… SAUSA300_0181   
##  2         13      31        358              30 MLFHKKIESLIS… SAUSA300_2167   
##  3         11      11        469              10 MKFSTLSEEEFT… SAUSA300_1141   
##  4         11      13       1105              49 MSNQNYDYNKNE… SAUSA300_0754   
##  5         11      17        569              20 MKIIHTADWHLG… SAUSA300_1242   
##  6         11      22        211              25 MAKLLIMSIVSF… <NA>            
##  7         11      54        140              56 MKFKSLITTTLA… <NA>            
##  8         11      55          0              61 MNYRDKIQKFSI… SAUSA300_1327   
##  9         10      17         65              27 MNMKKKEKHAIR… <NA>            
## 10         10      26        435              66 MELLNRYNFVLF… SAUSA300_1991   
## # … with 679 more rows, and 7 more variables: neb_mutant_id <chr>,
## #   neb_gene <chr>, neb_product <chr>, GENE <chr>, PRODUCT <chr>,
## #   MUTATION_SHORT <chr>, phenotype <chr>

Convergence analysis of growth rate

Explore growth rate

df_phenotypes
## # A tibble: 234 x 111
##    iso2  iso1   dist iso1_ST iso1_strain_gro… iso1_mecA iso1_mortality
##    <chr> <chr> <dbl> <chr>   <chr>                <dbl> <chr>         
##  1 BPH2… BPH2…    21 93      VAN-004                  1 Survived      
##  2 BPH2… BPH2…    21 93      VAN-004                  1 Survived      
##  3 BPH2… BPH2…    20 97      VAN-007                  0 Survived      
##  4 BPH2… BPH2…    20 97      VAN-007                  0 Survived      
##  5 BPH2… BPH2…    18 239     VAN-064                  1 Survived      
##  6 BPH2… BPH3…    16 239     VSS-287                  1 Died          
##  7 BPH2… BPH2…    20 239     VAN-155                  1 Died          
##  8 BPH2… BPH2…    29 45      VAN-011                  0 Survived      
##  9 BPH2… BPH2…    29 45      VAN-011                  0 Survived      
## 10 BPH2… BPH2…    18 239     VAN-082                  1 Survived      
## # … with 224 more rows, and 104 more variables: iso1_intrahost_index <dbl>,
## #   iso1_included <lgl>, iso1_unrelated_to_index <dbl>, iso1_mut_count <dbl>,
## #   iso1_scv <lgl>, iso1_vanco_difference <dbl>, iso1_persister_type <chr>,
## #   iso1_intrahost_sampledelay <dbl>, iso1_sample_type <chr>, iso1_genes <chr>,
## #   iso1_CC <chr>, iso2_ST <chr>, iso2_strain_group <chr>, iso2_mecA <dbl>,
## #   iso2_mortality <chr>, iso2_intrahost_index <dbl>, iso2_included <lgl>,
## #   iso2_unrelated_to_index <dbl>, iso2_mut_count <dbl>, iso2_scv <lgl>,
## #   iso2_vanco_difference <dbl>, iso2_persister_type <chr>,
## #   iso2_intrahost_sampledelay <dbl>, iso2_sample_type <chr>, iso2_genes <chr>,
## #   iso2_CC <chr>, CC <chr>, iso1_time_of_max_rate_OD <dbl>,
## #   iso1_max_rate_OD <dbl>, iso1_doubling_time_OD <dbl>, iso1_AUC_OD <dbl>,
## #   iso1_time_of_max_OD <dbl>, iso1_max_OD <dbl>, iso1_time_of_min_OD <dbl>,
## #   iso1_min_OD <dbl>, iso1_end_point_OD <dbl>,
## #   iso1_end_point_timepoint_OD <dbl>, iso1_AUC_death <dbl>,
## #   iso1_time_of_max_death <dbl>, iso1_max_death <dbl>,
## #   iso1_time_of_min_death <dbl>, iso1_min_death <dbl>,
## #   iso1_time_of_max_rate_death <dbl>, iso1_max_rate_death <dbl>,
## #   iso1_doubling_time_death <dbl>, iso1_end_point_death <dbl>,
## #   iso1_end_point_timepoint_death <dbl>, iso2_time_of_max_rate_OD <dbl>,
## #   iso2_max_rate_OD <dbl>, iso2_doubling_time_OD <dbl>, iso2_AUC_OD <dbl>,
## #   iso2_time_of_max_OD <dbl>, iso2_max_OD <dbl>, iso2_time_of_min_OD <dbl>,
## #   iso2_min_OD <dbl>, iso2_end_point_OD <dbl>,
## #   iso2_end_point_timepoint_OD <dbl>, iso2_AUC_death <dbl>,
## #   iso2_time_of_max_death <dbl>, iso2_max_death <dbl>,
## #   iso2_time_of_min_death <dbl>, iso2_min_death <dbl>,
## #   iso2_time_of_max_rate_death <dbl>, iso2_max_rate_death <dbl>,
## #   iso2_doubling_time_death <dbl>, iso2_end_point_death <dbl>,
## #   iso2_end_point_timepoint_death <dbl>, delta_time_of_max_rate_OD <dbl>,
## #   delta_max_rate_OD <dbl>, delta_doubling_time_OD <dbl>, delta_AUC_OD <dbl>,
## #   delta_time_of_max_OD <dbl>, delta_time_of_min_OD <dbl>, delta_max_OD <dbl>,
## #   delta_min_OD <dbl>, delta_end_point_OD <dbl>,
## #   delta_time_of_max_rate_death <dbl>, delta_max_rate_death <dbl>,
## #   delta_doubling_time_death <dbl>, delta_AUC_death <dbl>,
## #   delta_time_of_max_death <dbl>, delta_time_of_min_death <dbl>,
## #   delta_max_death <dbl>, delta_min_death <dbl>, delta_end_point_death <dbl>,
## #   log2fc_time_of_max_rate_OD <dbl>, log2fc_max_rate_OD <dbl>,
## #   log2fc_doubling_time_OD <dbl>, log2fc_AUC_OD <dbl>,
## #   log2fc_time_of_max_OD <dbl>, log2fc_time_of_min_OD <dbl>,
## #   log2fc_max_OD <dbl>, log2fc_min_OD <dbl>, log2fc_end_point_OD <dbl>,
## #   log2fc_time_of_max_rate_death <dbl>, log2fc_max_rate_death <dbl>,
## #   log2fc_doubling_time_death <dbl>, log2fc_AUC_death <dbl>,
## #   log2fc_time_of_max_death <dbl>, log2fc_time_of_min_death <dbl>, …
for (var in str_subset(colnames(df_phenotypes), "iso1_.*_OD")){
  p <- ggdensity(data = df_phenotypes, x = var, fill = "red") +
    theme_bw()
  
  print(p)
}

df_phenotypes %>%
  ggplot(aes(x = abs(delta_AUC_OD))) +
  geom_density(fill = "red") +
  theme_bw()

df_plot <- df_phenotypes %>%
  mutate_at(vars(matches("delta_.*_OD")),
            abs)

for (var in str_subset(colnames(df_plot), "delta_.*_OD")){
  p <- ggdensity(data = df_plot, x = var, fill = "red") +
    theme_bw()
  
  print(p)
}

Generate dataset of clusters of discordant growth

Based on the exploration above, we define a delta AUC threshold of 30 for discordant pairs (60 would be a more stringent one)

df_growth_close_clusters <- df_phenotypes %>%
  arrange(iso2) %>%
  filter(delta_AUC_OD < - 10) %>%
  group_by(iso2) %>%
  mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
  ungroup() %>%
  distinct(iso1, .keep_all =T)  %>%
  select(iso1, iso2, iso_cluster, delta_AUC_OD, iso1_AUC_OD, iso2_AUC_OD)

df_growth_close_clusters %>%
  .$iso_cluster %>%
  n_distinct()
## [1] 19
df_growth_close_clusters %>%
  ggplot(aes(x = fct_infreq(iso_cluster))) +
  geom_bar() +
  coord_flip() +
  labs(x = "") +
  theme_bw()

Convergence analysis of growth

df_mutations_growth_close_clusters <- snippy_data_modified_proteins %>%
  right_join(df_growth_close_clusters) 
## Joining, by = c("iso1", "iso2")
nrow(df_mutations_growth_close_clusters)
## [1] 1149
df_convergence_growth_clusters <- df_mutations_growth_close_clusters %>%
  drop_na(PRODUCT) %>%
  filter(EFFTYPE != "synonymous_variant") %>%
  group_by(clstr_prot, clstr_size_prot) %>%
   mutate(n_clusters = n_distinct(iso_cluster), 
          n_pairs = n_distinct(PAIR_ID),
         n_references = n_distinct(REFERENCE)) %>%
  select(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, iso_cluster, PAIR_ID, delta_AUC_OD, everything()) %>%
  arrange(clstr_prot, desc(n_clusters))

t_growth_clusters <- df_convergence_growth_clusters %>%
  group_by(clstr_prot) %>%
  mutate(SEQUENCE_prot = SEQUENCE_prot[1]) %>%
  distinct(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, GENE, PRODUCT, MUTATION_SHORT, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
  group_by(n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
  summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
               funs(str_c(unique(.), collapse = "|"))) %>%
  mutate(phenotype = "Growth rate (AUC)") %>%
  arrange(desc(n_clusters))

Convergence analysis of cytotoxicity

Explore cytotoxicity

for (var in str_subset(colnames(df_phenotypes), "iso1_.*_death")){
  p <- ggdensity(data = df_phenotypes, x = var, fill = "blue") +
    theme_bw()
  
  print(p)
}

df_plot <- df_phenotypes %>%
  mutate_at(vars(matches("delta_.*_death")),
            abs)

for (var in str_subset(colnames(df_plot), "delta_.*_death")){
  p <- ggdensity(data = df_plot, x = var, fill = "blue") +
    theme_bw()
  
  print(p)
}

Generate dataset of clusters of discordant cell death

df_cell_death_close_clusters <- df_phenotypes %>%
  arrange(iso2) %>%
  filter(delta_AUC_death < - 40) %>%
  group_by(iso2) %>%
  mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
  ungroup() %>%
  distinct(iso1, .keep_all =T)  %>%
  select(iso1, iso2, iso_cluster, delta_AUC_death, iso1_AUC_death, iso2_AUC_death)


df_cell_death_close_clusters %>%
  .$iso_cluster %>%
  n_distinct()
## [1] 12
df_cell_death_close_clusters %>%
  ggplot(aes(x = fct_infreq(iso_cluster))) +
  geom_bar() +
  coord_flip() +
  labs(x = "") +
  theme_bw()

Convergence analysis of cytotoxicity

df_mutations_cell_death_close_clusters <- snippy_data_modified_proteins %>%
  right_join(df_cell_death_close_clusters) 
## Joining, by = c("iso1", "iso2")
nrow(df_mutations_cell_death_close_clusters)
## [1] 838
df_convergence_cell_death_clusters <- df_mutations_cell_death_close_clusters %>%
  drop_na(PRODUCT) %>%
  filter(EFFTYPE != "synonymous_variant") %>%
  group_by(clstr_prot, clstr_size_prot) %>%
   mutate(n_clusters = n_distinct(iso_cluster), 
          n_pairs = n_distinct(PAIR_ID),
         n_references = n_distinct(REFERENCE)) %>%
  select(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, iso_cluster, PAIR_ID, delta_AUC_death, everything()) %>%
  arrange(clstr_prot, desc(n_clusters))

t_cell_death_clusters <- df_convergence_cell_death_clusters %>%
  group_by(clstr_prot) %>%
  mutate(SEQUENCE_prot = SEQUENCE_prot[1]) %>%
  distinct(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, GENE, PRODUCT, MUTATION_SHORT, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
  group_by(n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
  summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
               funs(str_c(unique(.), collapse = "|"))) %>%
  mutate(phenotype = "Cell death (AUC)")

Merge datasets

t <- bind_rows(t_mortality_clusters,
               t_growth_clusters,
               t_cell_death_clusters)

t_convergent <- t %>%
  ungroup() %>%
  select(clstr_prot, starts_with("neb"), n_clusters, phenotype) %>%
  pivot_wider(names_from = phenotype, names_prefix = "n_clusters_", values_from = n_clusters, values_fill = list(n_clusters = 0)) %>%
  left_join(t) %>%
   group_by_at(vars(clstr_prot, starts_with("n_clusters_"),
                    starts_with("neb"))) %>%
  summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
               funs(str_c(unique(.), collapse = "|")))
## Joining, by = c("clstr_prot", "nebraska_locus_tag", "neb_mutant_id", "neb_gene", "neb_product")
write_csv(t_convergent,
          "Ideas_Grant_2020_analysis/Convergence_analysis_tables/convergence_analysis_clusters_reduntant_controls.csv")

Summarise data (for Venn diagram): all mutated genes

t_venn <- t_convergent %>%
  ungroup() %>%
  mutate_at(vars(starts_with("n_clusters")),
            funs(mutated = as.integer(. > 0))) %>%
  unite(col = "intersect", ends_with("mutated"), sep = "") %>%
  mutate(intersect = recode(intersect,
                            `111` = "all",
                            `100` = "SAB mortality only",
                            `110` = "SAB mortality and growth rate",
                            `101` = "SAB mortality and cell death",
                            `011` = "Growth rate and cell death",
                            `010` = "Growth rate only",
                            `011` = "Growth rate and cell death",
                            `001` = "Cell death only"))

t_venn_synthesis <- t_venn %>%
  count(intersect)

Summarise data (for Venn diagram): genes mutated at least 2 times (convergent)

t_venn_stringent <- t_convergent %>%
  ungroup() %>%
  mutate_at(vars(starts_with("n_clusters")),
            funs(mutated = as.integer(. > 1))) %>%
  unite(col = "intersect", ends_with("mutated"), sep = "") %>%
  filter(intersect != "000") %>%
  mutate(intersect = recode(intersect,
                            `111` = "all",
                            `100` = "SAB mortality only",
                            `110` = "SAB mortality and growth rate",
                            `101` = "SAB mortality and cell death",
                            `011` = "Growth rate and cell death",
                            `010` = "Growth rate only",
                            `011` = "Growth rate and cell death",
                            `001` = "Cell death only"))

t_venn__stringent_synthesis <- t_venn_stringent %>%
  count(intersect)

Import tree

library(ggtree)
## ggtree v2.0.1  For help: https://yulab-smu.github.io/treedata-book/
## 
## If you use ggtree in published research, please cite the most appropriate paper(s):
## 
## - Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, 35(12):3041-3043. doi: 10.1093/molbev/msy194
## - Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36, doi:10.1111/2041-210X.12628
## 
## Attaching package: 'ggtree'
## The following object is masked from 'package:ggpubr':
## 
##     rotate
## The following object is masked from 'package:magrittr':
## 
##     inset
## The following object is masked from 'package:tidyr':
## 
##     expand
tree <- read.tree("Ideas_Grant_2020_analysis/Raw_data/VANANZ_phenotypes_n843.tree") %>%
  phytools::midpoint.root()
ggtree(tree, layout = "circular")

Show pairs on the tree

mortality_close_clusters_isolates_iso1 <- df_mortality_close_clusters %>%
  separate(switches, into = c("iso1_mortality", "iso2_mortality")) %>%
  ungroup() %>%
  select(starts_with("iso1")) %>%
  rename(ISOLATE = iso1, mortality = iso1_mortality)

mortality_close_clusters_isolates_iso2 <- df_mortality_close_clusters %>%
  separate(switches, into = c("iso1_mortality", "iso2_mortality")) %>%
  ungroup() %>%
  select(starts_with("iso2")) %>%
 rename(ISOLATE = iso2, mortality = iso2_mortality)

mortality_close_clusters_isolates <- bind_rows(mortality_close_clusters_isolates_iso1,
                                               mortality_close_clusters_isolates_iso2) %>%
  distinct()

ggtree(tree, layout = "circular") %<+% mortality_close_clusters_isolates +
  geom_point2(aes(subset = !is.na(mortality), colour = mortality))